{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "# Inline plotting backend only. Unlike %pylab, this does not star-import numpy and\n",
    "# matplotlib into the notebook namespace or shadow builtins such as sum/all/any.\n",
    "%matplotlib inline\n",
    "import subprocess\n",
    "import sys\n",
    "\n",
    "import numpy as np\n",
    "import pandas as pd\n",
    "\n",
    "# Environment diagnostics, printed before the GPU libraries are imported so they are\n",
    "# still visible if one of those imports fails.\n",
    "!nvidia-smi\n",
    "print(sys.path)\n",
    "\n",
    "import py3nvml ## pip install -e git+https://github.com/fbcotter/py3nvml#egg=py3nvml\n",
    "import h2o4gpu"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "# Read the training and testing datasets from disk\n",
    "from os.path import expanduser\n",
    "home = expanduser(\"~\")\n",
    "train_file = home + \"/h2o4gpu-benchmarks/Data/Homesite/train.csv\"\n",
    "test_file = home + \"/h2o4gpu-benchmarks/Data/Homesite/test.csv\"\n",
    "\n",
    "# Choose number of clusters (set before loading so it exists even if the load fails)\n",
    "k = 10\n",
    "\n",
    "try:\n",
    "    train = pd.read_csv(train_file)\n",
    "    test = pd.read_csv(test_file)\n",
    "except OSError:\n",
    "    # Only missing/unreadable files get the download hint; parse errors and other\n",
    "    # failures propagate untouched. The error is re-raised because every later cell\n",
    "    # needs `train`/`test` and would otherwise fail with an unrelated NameError.\n",
    "    print(\"\"\"\n",
    "          Data is not located in specified path. \\n\n",
    "          Data can be downloaded from below link:\\n\n",
    "          https://www.kaggle.com/c/homesite-quote-conversion/data\n",
    "          \"\"\")\n",
    "    raise"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "# Data munging step - KMeans takes only numerical values\n",
    "\n",
    "# Categories seen fewer than this many times are pooled into a single level\n",
    "MIN_CATEGORY_COUNT = 100\n",
    "\n",
    "# Rebind instead of dropping in place; errors='ignore' keeps the cell re-runnable\n",
    "# after the target column has already been removed.\n",
    "train = train.drop(['QuoteConversion_Flag'], axis=1, errors='ignore')\n",
    "n_train = train.shape[0]\n",
    "dataset = pd.concat([train, test], ignore_index=True)\n",
    "\n",
    "# Columns that pandas loaded with object dtype are treated as categoricals\n",
    "categoricals = dataset.select_dtypes(include=['object']).columns.tolist()\n",
    "categorical_set = set(categoricals)\n",
    "\n",
    "# Missing values: a dedicated \"__NA__\" category for categoricals, 0 for all other\n",
    "# columns (filled directly so numeric columns are never upcast to object dtype)\n",
    "for col in dataset.columns:\n",
    "    fill_value = \"__NA__\" if col in categorical_set else 0\n",
    "    dataset[col] = dataset[col].fillna(fill_value)\n",
    "\n",
    "# Pool infrequent categories. Working on the value_counts Series itself avoids\n",
    "# depending on the column names produced by reset_index(), which changed in pandas 2.0.\n",
    "for col in categoricals:\n",
    "    counts = dataset[col].value_counts()\n",
    "    rare_levels = counts.index[counts < MIN_CATEGORY_COUNT]\n",
    "    dataset.loc[dataset[col].isin(rare_levels), col] = \"___UNFREQ___\"\n",
    "\n",
    "# Replace every category by its relative frequency over train + test\n",
    "for col in categoricals:\n",
    "    freqs = dataset[col].value_counts() / float(dataset.shape[0])\n",
    "    dataset[col] = dataset[col].map(freqs)\n",
    "\n",
    "# Split back into the original train / test row ranges\n",
    "trainenc = dataset.iloc[:n_train, :].reset_index(drop=True)\n",
    "testenc = dataset.iloc[n_train:, :].reset_index(drop=True)\n",
    "\n",
    "trainencflt = trainenc.values.astype(np.float32)\n",
    "testencflt = testenc.values.astype(np.float32)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## H2O4GPU K-Means (single-GPU)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "# Single-GPU run: build an H2O4GPU KMeans estimator with n_gpus=1 and time the fit on\n",
    "# the encoded training matrix. max_iter is not passed, so the library default applies\n",
    "# (300 iterations per the original note).\n",
    "model = h2o4gpu.KMeans(n_clusters=k, n_gpus=1)\n",
    "%time model.fit(trainencflt)\n",
    "\n",
    "# Optional follow-up timings, left disabled:\n",
    "#%time train_centroid_distance = model.transform(trainencflt)\n",
    "#%time train_labels     = model.predict(trainencflt)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## H2O4GPU K-Means (multi-GPU)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "# Multi-GPU run: same estimator configuration but with n_gpus=2, timed on the same\n",
    "# encoded training matrix. max_iter is not passed, so the library default applies\n",
    "# (300 iterations per the original note).\n",
    "model = h2o4gpu.KMeans(n_clusters=k, n_gpus=2)\n",
    "%time model.fit(trainencflt)\n",
    "\n",
    "# Optional follow-up timings, left disabled:\n",
    "#%time train_centroid_distance = model.transform(trainencflt)\n",
    "#%time train_labels     = model.predict(trainencflt)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Scikit-Learn"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": true,
    "scrolled": true
   },
   "outputs": [],
   "source": [
    "# Fit a scikit-learn KMeans model as the CPU baseline (single initialisation).\n",
    "# n_jobs is deliberately not passed: it was deprecated in scikit-learn 0.23 and removed\n",
    "# in 1.0 (passing it raises TypeError there). In older releases it only parallelised\n",
    "# across the n_init restarts, so with n_init=1 it had no effect on this run.\n",
    "from sklearn.cluster import KMeans\n",
    "model = KMeans(n_clusters=k, n_init=1)\n",
    "%time model.fit(trainencflt)\n",
    "#train_assignments = model.predict(trainencflt)\n",
    "#test_assignments = model.predict(testencflt)"
   ]
  }
 ],
 "metadata": {
  "anaconda-cloud": {},
  "kernelspec": {
   "display_name": ".venv",
   "language": "python",
   "name": ".venv"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.6.1"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 1
}
